{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib as mpl\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.ensemble import BaggingRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Global matplotlib text settings, applied in one call.\n",
    "# SimHei is presumably chosen so the Chinese plot labels render; unicode_minus is\n",
    "# switched off alongside it (TODO confirm the font lacks the Unicode minus glyph).\n",
    "mpl.rcParams.update({\n",
    "    'font.sans-serif': [u'simHei'],\n",
    "    'axes.unicode_minus': False,\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Configure how matplotlib figures are displayed from Jupyter: select the Tk backend\n",
    "# (presumably figures then open in separate GUI windows instead of inline).\n",
    "%matplotlib tk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Data file locations (relative to the notebook's working directory)\n",
    "path1 = \"C0904.csv\"\n",
    "path2 = \"C0911.csv\"\n",
    "# Despite its name, this is used as the CSV column to analyse (data[filename]);\n",
    "# H2O is presumably an alternative column name — TODO confirm.\n",
    "filename = \"X12CO2\" # H2O"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "### Load the raw data and plot both recordings side by side\n",
    "# One (csv path, legend label, panel title) entry per subplot, left to right.\n",
    "panels = [\n",
    "    (path1, 'C0904', '实际数据0904'),\n",
    "    (path2, 'C0911', '实际数据0911'),\n",
    "]\n",
    "\n",
    "plt.figure(figsize=(10, 6), facecolor='w')\n",
    "for position, (csv_path, legend_label, panel_title) in enumerate(panels, start=1):\n",
    "    plt.subplot(1, len(panels), position)\n",
    "    data = pd.read_csv(csv_path, header=0)\n",
    "    x = data[filename].values\n",
    "    plt.plot(x, 'r-', lw=1, label=legend_label)\n",
    "    plt.title(panel_title, fontsize=18)\n",
    "    plt.legend(loc='upper right')\n",
    "    plt.xlim(0, 80000)\n",
    "    # Flag passed positionally: the `b=` keyword was renamed to `visible=` in\n",
    "    # matplotlib 3.5, while the positional form works on old and new versions.\n",
    "    plt.grid(True)\n",
    "\n",
    "# `pad` is keyword-only on current matplotlib; rect leaves the top 5% for the suptitle.\n",
    "plt.tight_layout(pad=2, rect=(0, 0, 1, 0.95))\n",
    "plt.suptitle('如何找到下图中的异常值', fontsize=20)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "### Detect abnormal samples with a sliding window, then correct them\n",
    "data = pd.read_csv(path2, header=0)\n",
    "x = data[filename].values\n",
    "\n",
    "width = 300    # samples per sliding window\n",
    "delta = 5      # step between consecutive window starts\n",
    "eps = 0.02     # threshold on (max - min) / min inside one window\n",
    "seed = 0       # fixes the bagging resampling so re-runs give the same correction\n",
    "N = len(x)\n",
    "\n",
    "# Relative spread of every window, kept for inspection (e.g. plt.plot(p)).\n",
    "p = []\n",
    "# True for every sample that lies inside at least one flagged window.\n",
    "is_abnormal = np.zeros(N, dtype=bool)\n",
    "# NOTE: window starts stop before N - width, so the last few samples of the\n",
    "# series are never examined.\n",
    "for i in range(0, N - width, delta):\n",
    "    s = x[i:i + width]\n",
    "    # Peak-to-peak range relative to the window minimum; this presumes strictly\n",
    "    # positive readings (a zero minimum would divide by zero) — TODO confirm.\n",
    "    ptp_min = np.ptp(s) / np.min(s)\n",
    "    p.append(ptp_min)\n",
    "    # A window whose relative spread exceeds the threshold is flagged as a whole.\n",
    "    if ptp_min > eps:\n",
    "        is_abnormal[i:i + width] = True\n",
    "# Sorted, unique integer positions of the abnormal samples. Going through a boolean\n",
    "# mask keeps this an integer array even when nothing is flagged, so it stays usable\n",
    "# as an index below.\n",
    "abnormal = np.flatnonzero(is_abnormal)\n",
    "\n",
    "plt.figure(figsize=(18, 7), facecolor='w')\n",
    "\n",
    "# Panel 1: the raw series.\n",
    "plt.subplot(131)\n",
    "plt.plot(x, 'r-', lw=1, label='原始数据')\n",
    "plt.title('实际数据', fontsize=18)\n",
    "plt.legend(loc='upper right')\n",
    "plt.xlim(0, 80000)\n",
    "plt.grid(True)\n",
    "\n",
    "# Panel 2: the raw series with the flagged samples highlighted.\n",
    "plt.subplot(132)\n",
    "t = np.arange(N)\n",
    "plt.plot(t, x, 'r-', lw=1, label='原始数据')\n",
    "plt.plot(abnormal, x[abnormal], 'go', markeredgecolor='g', ms=3, label='异常值')\n",
    "plt.legend(loc='upper right')\n",
    "plt.title('异常数据检测', fontsize=18)\n",
    "plt.xlim(0, 80000)\n",
    "plt.grid(True)\n",
    "\n",
    "# Panel 3: replace the flagged samples with model predictions.\n",
    "plt.subplot(133)\n",
    "select = ~is_abnormal    # samples considered normal; used as training data\n",
    "# Bagged, depth-limited regression trees mapping sample index -> value. The split\n",
    "# criterion is left at its default (squared error); the 'mse' alias for it was\n",
    "# removed in scikit-learn 1.2.\n",
    "dtr = DecisionTreeRegressor(max_depth=10)\n",
    "br = BaggingRegressor(dtr, n_estimators=10, max_samples=0.3, random_state=seed)\n",
    "# Train on the normal samples only.\n",
    "br.fit(t[select].reshape(-1, 1), x[select])\n",
    "# Predict every position, then restore the original value wherever the sample was\n",
    "# normal, so only the abnormal positions carry model output.\n",
    "y = br.predict(t.reshape(-1, 1))\n",
    "y[select] = x[select]\n",
    "plt.plot(x, 'g--', lw=1, label='原始值')    # original values\n",
    "plt.plot(y, 'r-', lw=1, label='校正值')     # corrected values\n",
    "plt.legend(loc='upper right')\n",
    "plt.title('异常值校正', fontsize=18)\n",
    "plt.xlim(0, 80000)\n",
    "plt.grid(True)\n",
    "\n",
    "# `pad` is keyword-only on current matplotlib; rect leaves the top 5% for the suptitle.\n",
    "plt.tight_layout(pad=1.5, rect=(0, 0, 1, 0.95))\n",
    "plt.suptitle('异常值检测与校正', fontsize=22)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
